This workflow reanalyze the single nucleus RNA-seq data produced by (Habib et al. 2017), using DroNc-seq: massively parallel sNuc-seq with droplet technology.
We will first load a few libraries. Among them, * DropletUtils provides functions for data from droplet technologies such as 10X Genomics. * biomaRt provides easy access to databases, such as Ensembl, COSMIC, Uniprot, HGNC, etc. * scater is a collection of tools for doing quality control analyses of scRNA-seq * scran provide functions for normalization of cell-specific libary sizes, correcting batch effects, and identification marker genes
bio_packs = c("SingleCellExperiment","DropletUtils","biomaRt",
"scater","scran","limma","org.Hs.eg.db","SC3")
source("https://bioconductor.org/biocLite.R")
for(pack1 in bio_packs){
if( !pack1 %in% installed.packages()[,"Package"]){
biocLite(pack1, suppressUpdates = TRUE)
}
}
cran_packs = c("data.table","svd","Rtsne","stringi","irlba")
for(pack1 in cran_packs){
if( !pack1 %in% installed.packages()[,"Package"]){
install.packages(pack1)
}
}
library(data.table)
library(SingleCellExperiment)
library(DropletUtils)
library(biomaRt)
library(scater)
library(scran)
library(limma)
library(ggplot2)We setup different directories, and set num_cores improve SC3 runtime, if run_sc3 is TRUE. The step of runing SC clustering can take five or more hours without using more cores.
repo_dir = "~/research/GitHub/scRNAseq_pipelines"
work_dir = file.path(repo_dir,"dronc")
dronc_dir = "~/research/scRNAseq/data/GTEx_droncseq_hip_pcf"
# repo_dir = "/pine/scr/p/l/pllittle/CS_eQTL/s3_Real/scRNAseq_pipelines"
# work_dir = file.path(repo_dir,"dronc")
# dronc_dir = file.path(work_dir,"GTEx_droncseq_hip_pcf")
source(file.path(repo_dir,"SOURCE.R"))
run_sc3 = TRUE
num_cores = ifelse(run_sc3,7,1)Next we import in the count data and other available information. The dataset is available here.
counts_fn = file.path(dronc_dir,"GTEx_droncseq_hip_pcf.umi_counts.txt.gz")
counts = fread(cmd=sprintf("zcat < %s",counts_fn),data.table = FALSE)
dim(counts); counts[1:3,1:2]## [1] 32111 14964
## V1 hHP1_AACACTATCTAC
## 1 A1BG 0
## 2 A1BG-AS1 0
## 3 A1CF 0
rownames(counts) = counts$V1
counts = as.matrix(counts[,-1])
colnames(counts)[1:10]## [1] "hHP1_AACACTATCTAC" "hHP1_CTACGCATCCAT" "hHP1_TCGGTACTAATA"
## [4] "hHP1_CCCGCACGCTAT" "hHP1_TCATTTTGTCAT" "hHP1_ACGAGGTCTATG"
## [7] "hHP1_AGTCATGAGGTT" "hHP1_GTTAGTATACCA" "hHP1_GCATTCAGTAAG"
## [10] "hHP1_AGACCGCGACTA"
part_cell_id = sapply(colnames(counts),
function(xx) strsplit(xx,"_")[[1]][1],
USE.NAMES=FALSE)
col_dat = smart_df(sample_name = colnames(counts),
part_cell_id = part_cell_id)
col_dat[1:5,]## sample_name part_cell_id
## 1 hHP1_AACACTATCTAC hHP1
## 2 hHP1_CTACGCATCCAT hHP1
## 3 hHP1_TCGGTACTAATA hHP1
## 4 hHP1_CCCGCACGCTAT hHP1
## 5 hHP1_TCATTTTGTCAT hHP1
Import clustering results by (Habib et al. 2017). We utilize the supplemental files to label clusters and incorporate the existing TSNE results. Refer to the file nmeth.4407-S10.xlsx.
clust_fn = file.path(dronc_dir,"GTEx_droncseq_hip_pcf.clusters.txt.gz")
clust = fread(cmd = sprintf("zcat < %s",clust_fn),data.table = FALSE)
dim(clust); clust[1:5,]## [1] 14963 2
## V1 V2
## 1 hHP1_AACACTATCTAC 4
## 2 hHP1_CTACGCATCCAT 3
## 3 hHP1_TCGGTACTAATA 3
## 4 hHP1_CCCGCACGCTAT 3
## 5 hHP1_TCATTTTGTCAT 3
names(clust) = c("sample_name","Habib_clusters")
clust$Habib_clusters = as.factor(clust$Habib_clusters)
# Double check sample names
all(col_dat$sample_name == clust$sample_name)## [1] TRUE
cname = c("exPFC1","exPFC2","exCA1","exCA3","GABA1","GABA2","exDG",
"ASC1","ASC2","ODC1","ODC2","OPC","MG","NSC","END",rep(NA,4))
ctype = c("exPFC","exPFC","exCA1","exCA3", "GABA","GABA","exDG","ASC",
"ASC","ODC","ODC","OPC","MG","NSC","END",rep(NA,4))
map_clust_name = smart_df(Habib_clusters = as.factor(seq(19)),
Habib_clusters_name = cname,
Habib_cell_type = ctype)
clust = smart_merge(clust,map_clust_name)
clust = clust[match(col_dat$sample_name,clust$sample_name),]
clust[1:5,]## Habib_clusters sample_name Habib_clusters_name Habib_cell_type
## 9142 4 hHP1_AACACTATCTAC exCA3 exCA3
## 8826 3 hHP1_CTACGCATCCAT exCA1 exCA1
## 8732 3 hHP1_TCGGTACTAATA exCA1 exCA1
## 8735 3 hHP1_CCCGCACGCTAT exCA1 exCA1
## 8731 3 hHP1_TCATTTTGTCAT exCA1 exCA1
We marge the TSNE results from (Habib et al. 2017) into the sce object. We also check for spike-ins, and as expected, there is no spike-ins in this dataset.
tsne_fn = file.path(dronc_dir,"GTEx_droncseq_hip_pcf.tsne.txt.gz")
df_tsne = fread(cmd = sprintf("zcat < %s",tsne_fn),data.table = FALSE)
dim(df_tsne); df_tsne[1:5,]## [1] 14963 3
## V1 tSNE_1 tSNE_2
## 1 hHP1_AACACTATCTAC -8.436537 7.721106
## 2 hHP1_CTACGCATCCAT -6.606278 8.099538
## 3 hHP1_TCGGTACTAATA -6.724967 8.011943
## 4 hHP1_CCCGCACGCTAT -6.286364 7.892359
## 5 hHP1_TCATTTTGTCAT -10.455725 2.374719
names(df_tsne) = c("sample_name",paste0("Habib_TSNE",1:2))
table(df_tsne$sample_name == clust$sample_name)##
## TRUE
## 14963
table(df_tsne$sample_name == clust$sample_name)##
## TRUE
## 14963
is_match = (col_dat$sample_name == df_tsne$sample_name) &
(df_tsne$sample_name == clust$sample_name); table(is_match)## is_match
## TRUE
## 14963
col_dat = cbind(col_dat,
clust[,names(clust) != "sample_name"],
df_tsne[,names(df_tsne) != "sample_name"])
col_dat[1:5,]## sample_name part_cell_id Habib_clusters Habib_clusters_name
## 9142 hHP1_AACACTATCTAC hHP1 4 exCA3
## 8826 hHP1_CTACGCATCCAT hHP1 3 exCA1
## 8732 hHP1_TCGGTACTAATA hHP1 3 exCA1
## 8735 hHP1_CCCGCACGCTAT hHP1 3 exCA1
## 8731 hHP1_TCATTTTGTCAT hHP1 3 exCA1
## Habib_cell_type Habib_TSNE1 Habib_TSNE2
## 9142 exCA3 -8.436537 7.721106
## 8826 exCA1 -6.606278 8.099538
## 8732 exCA1 -6.724967 8.011943
## 8735 exCA1 -6.286364 7.892359
## 8731 exCA1 -10.455725 2.374719
sce = SingleCellExperiment(assays = list(counts = counts),colData = col_dat)
sce## class: SingleCellExperiment
## dim: 32111 14963
## metadata(0):
## assays(1): counts
## rownames(32111): A1BG A1BG-AS1 ... ZZEF1 ZZZ3
## rowData names(0):
## colnames(14963): hHP1_AACACTATCTAC hHP1_CTACGCATCCAT ...
## PFC-CD_CTCCATTCATGC PFC-CD_CGTCATTAGCAG
## colData names(7): sample_name part_cell_id ... Habib_TSNE1
## Habib_TSNE2
## reducedDimNames(0):
## spikeNames(0):
rownames(sce)[grep("^ERCC",rownames(sce))]## [1] "ERCC1" "ERCC2" "ERCC3" "ERCC4" "ERCC5" "ERCC6" "ERCC6L"
## [8] "ERCC6L2" "ERCC8"
We will extract annotation information based on gene names. Since the existing count data from (Habib et al. 2017) were based on mapping to hg19, we use this version of reference genome.
anno_file = file.path(work_dir,"gene.annotation_dronc.rds")
if( file.exists(anno_file) ){
gene_anno = readRDS(anno_file)
} else{
ensembl = useEnsembl(biomart="ensembl", GRCh=37,
dataset="hsapiens_gene_ensembl")
attr_string = c("hgnc_symbol","ensembl_gene_id","external_gene_name",
"chromosome_name", "start_position","end_position","strand",
"description","percentage_gene_gc_content","gene_biotype")
gene_anno = getBM(attributes = attr_string,
filters = "external_gene_name",
values = rownames(sce),
mart = ensembl)
saveRDS(gene_anno, file=anno_file)
}
dim(sce); dim(gene_anno)## [1] 32111 14963
## [1] 34526 10
We remove those genes that are part of extracted annotation, but aren’t in sce.
w2rm = which(!gene_anno$external_gene_name %in% rownames(sce))
w2rm## [1] 4232 4645
gene_anno[w2rm,]## hgnc_symbol ensembl_gene_id external_gene_name chromosome_name
## 4232 ENSG00000150076 C10ORF68 10
## 4645 ENSG00000184909 C1ORF220 1
## start_position end_position strand
## 4232 32832297 33171802 1
## 4645 178511950 178517579 1
## description
## 4232 Homo sapiens coiled-coil domain containing 7 (CCDC7), transcript variant 5, mRNA. [Source:RefSeq mRNA;Acc:NM_024688]
## 4645
## percentage_gene_gc_content gene_biotype
## 4232 37.23 protein_coding
## 4645 49.91 protein_coding
gene_anno = gene_anno[-w2rm,]
dim(sce); dim(gene_anno)## [1] 32111 14963
## [1] 34524 10
Many genes have multiple entries in the annotation, often because they are annotated to scaffolds, assembly patches and alternate loci. Here we simply remove such entries. The we remove duplicated annotations and genes without annotations.
table(gene_anno$chromosome_name)[1:30]##
## 1 10 11 12 13 14
## 3084 1252 1741 1694 637 1145
## 15 16 17 18 19 2
## 1169 1426 1820 633 1955 2186
## 20 21 22 3 4 5
## 800 383 704 1784 1301 1567
## 6 7 8 9 GL000192.1 GL000193.1
## 1622 1446 1296 1207 2 1
## GL000194.1 GL000195.1 GL000199.1 GL000204.1 GL000205.1 GL000213.1
## 2 1 1 1 2 1
chr_nms = c(1:22,"X","Y","MT")
gene_anno = gene_anno[which(gene_anno$chromosome_name %in% chr_nms),]
dim(sce); dim(gene_anno)## [1] 32111 14963
## [1] 31978 10
t1 = table(gene_anno$external_gene_name)
t2 = sort(t1[t1 > 1], decreasing=TRUE)
length(t2)## [1] 40
t2[1:10]##
## SNORD113 MIR1302-2 NPIPA7 CCDC177 CDRT1
## 6 4 3 2 2
## CEBPA-AS1 FAM226B FAM47E-STBD1 GATS GOLGA7B
## 2 2 2 2 2
gene_anno[which(gene_anno$external_gene_name %in% names(t2)[1:4]), 1:4]## hgnc_symbol ensembl_gene_id external_gene_name chromosome_name
## 5013 CCDC177 ENSG00000255994 CCDC177 14
## 5014 ENSG00000267909 CCDC177 14
## 14832 MIR1302-11 ENSG00000267588 MIR1302-2 19
## 14833 MIR1302-10 ENSG00000267588 MIR1302-2 19
## 14834 MIR1302-9 ENSG00000267588 MIR1302-2 19
## 14835 MIR1302-2 ENSG00000267588 MIR1302-2 19
## 16502 ENSG00000183889 NPIPA7 16
## 16503 NPIPA7 ENSG00000214967 NPIPA7 16
## 16505 ENSG00000233024 NPIPA7 16
## 29789 ENSG00000200150 SNORD113 14
## 29810 ENSG00000201500 SNORD113 14
## 29811 ENSG00000201036 SNORD113 14
## 29812 ENSG00000201710 SNORD113 14
## 29825 ENSG00000222095 SNORD113 14
## 29826 ENSG00000222185 SNORD113 14
w_duplicate = which(gene_anno$external_gene_name %in% names(t2))
ganno2 = gene_anno[w_duplicate,]
dim(ganno2)## [1] 87 10
table(ganno2$hgnc_symbol == ganno2$external_gene_name)##
## FALSE TRUE
## 46 41
ganno2 = ganno2[which(ganno2$hgnc_symbol == ganno2$external_gene_name),]
dim(ganno2)## [1] 41 10
ganno2 = dplyr::distinct(ganno2,external_gene_name,.keep_all = TRUE)
dim(ganno2)## [1] 39 10
gene_anno = rbind(gene_anno[-w_duplicate,], ganno2)
dim(gene_anno)## [1] 31930 10
table(gene_anno$gene_biotype)##
## 3prime_overlapping_ncrna antisense IG_C_gene
## 11 4152 3
## IG_V_gene lincRNA miRNA
## 6 4807 966
## misc_RNA Mt_rRNA Mt_tRNA
## 935 2 18
## processed_transcript protein_coding pseudogene
## 394 18296 3
## rRNA sense_intronic sense_overlapping
## 204 648 172
## snoRNA snRNA TR_C_gene
## 201 1083 5
## TR_J_gene TR_V_gene
## 7 17
gene_missing = setdiff(rownames(sce),gene_anno$external_gene_name)
gene_missing[1:10]## [1] "AC005152.2" "AC006132.1" "AC006445.8" "AC007092.1" "AC007390.5"
## [6] "AC007464.1" "AC009232.2" "AC009236.1" "AC010127.5" "AC011043.1"
length(gene_missing)## [1] 181
w2kp = match(gene_anno$external_gene_name,rownames(sce))
table(is.na(w2kp))##
## FALSE
## 31930
gene_anno$external_gene_name[which(is.na(w2kp))]## character(0)
sce = sce[w2kp,]
dim(sce)## [1] 31930 14963
rowData(sce) = gene_anno
sce## class: SingleCellExperiment
## dim: 31930 14963
## metadata(0):
## assays(1): counts
## rownames(31930): AC006946.16 AC006946.17 ... ZNF8 ZNF788
## rowData names(10): hgnc_symbol ensembl_gene_id ...
## percentage_gene_gc_content gene_biotype
## colnames(14963): hHP1_AACACTATCTAC hHP1_CTACGCATCCAT ...
## PFC-CD_CTCCATTCATGC PFC-CD_CGTCATTAGCAG
## colData names(7): sample_name part_cell_id ... Habib_TSNE1
## Habib_TSNE2
## reducedDimNames(0):
## spikeNames(0):
rowData(sce)[1:5,]## DataFrame with 5 rows and 10 columns
## hgnc_symbol ensembl_gene_id external_gene_name chromosome_name
## <character> <character> <character> <character>
## 1 ENSG00000273203 AC006946.16 22
## 2 ENSG00000273442 AC006946.17 22
## 3 ENSG00000236754 AC004019.13 22
## 4 ABHD3 ENSG00000158201 ABHD3 18
## 5 ENSG00000272682 AC004471.10 22
## start_position end_position strand
## <integer> <integer> <integer>
## 1 17548711 17551565 1
## 2 17561591 17562346 1
## 3 18062923 18071958 -1
## 4 19230858 19284766 -1
## 5 19111822 19115962 -1
## description
## <character>
## 1
## 2
## 3
## 4 abhydrolase domain containing 3 [Source:HGNC Symbol;Acc:18718]
## 5
## percentage_gene_gc_content gene_biotype
## <numeric> <character>
## 1 41.61 lincRNA
## 2 41.67 lincRNA
## 3 52.6 antisense
## 4 42.45 protein_coding
## 5 58.3 lincRNA
Please refer to this workflow in bioconductor for reference.
bcrank = barcodeRanks(counts(sce))
# Only show unique points for plotting speed.
uniq = !duplicated(bcrank$rank)
par(mar=c(5,4,2,1), bty="n")
plot(bcrank$rank[uniq], bcrank$total[uniq], log="xy",
xlab="Rank", ylab="Total UMI count", cex=0.5, cex.lab=1.2)
abline(h=bcrank$inflection, col="darkgreen", lty=2,lwd=2)
abline(h=bcrank$knee, col="dodgerblue", lty=2,lwd=2)
legend("left", legend=c("Inflection", "Knee"), bty="n",
col=c("darkgreen", "dodgerblue"), lty=2, cex=1.2,lwd=2)bcrank$inflection## hCf_TAATAGCGGTAC
## 221
bcrank$knee## PFC2-A2_CACAGCGCTGTC
## 580
summary(bcrank$total)## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 218.0 459.0 716.0 948.9 1167.0 9423.0
table(bcrank$total >= bcrank$knee)##
## FALSE TRUE
## 5513 9450
table(bcrank$total >= bcrank$inflection)##
## FALSE TRUE
## 1 14962
set.seed(100)
date()## [1] "Thu Jan 31 16:11:35 2019"
e_out = emptyDrops(counts(sce))
date()## [1] "Thu Jan 31 16:11:55 2019"
e_out## DataFrame with 14963 rows and 5 columns
## Total LogProb PValue Limited FDR
## <integer> <numeric> <numeric> <logical> <numeric>
## hHP1_AACACTATCTAC 4484 NaN 1 FALSE 0
## hHP1_CTACGCATCCAT 4919 NaN 1 FALSE 0
## hHP1_TCGGTACTAATA 5163 NaN 1 FALSE 0
## hHP1_CCCGCACGCTAT 5030 NaN 1 FALSE 0
## hHP1_TCATTTTGTCAT 3588 NaN 1 FALSE 0
## ... ... ... ... ... ...
## PFC-CD_TTGCCTGGCGGG 590 NaN 1 FALSE 0
## PFC-CD_CACGCTCCCCTA 269 NaN 1 FALSE 1
## PFC-CD_GCTCTACAACCG 522 NaN 1 FALSE 1
## PFC-CD_CTCCATTCATGC 497 NaN 1 FALSE 1
## PFC-CD_CGTCATTAGCAG 568 NaN 1 FALSE 1
length(unique(e_out$FDR))## [1] 2
table(e_out$FDR)##
## 0 1
## 9450 5513
tapply(e_out$Total, e_out$FDR, summary)## $`0`
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 581 748 1011 1268 1487 9423
##
## $`1`
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 218.0 317.0 391.0 402.3 489.0 580.0
From the above analysis, some cells with very small number of UMIs. Here we chose do not remove any cells because it appears all these 14,963 cells were used in the main anlaysis by (Habib et al. 2017). Based on Figure 2 of their paper, there are
14,963 DroNc-seq nuclei profiles (each with >10,000 reads and >200 genes)
We will generate a set of QC features per cell, including the expression of mitochondira/ribosomal genes. We identify ribosomal genes based on annoation from https://www.genenames.org/.
ribo_fn = file.path(work_dir,"ribosome_genes.txt")
ribo = smart_RT(ribo_fn,sep='\t',header=TRUE)
ribo[1:2,]## HGNC.ID Approved.Symbol Approved.Name Status Previous.Symbols
## 1 10298 RPL10 ribosomal protein L10 Approved
## 2 10299 RPL10A ribosomal protein L10a Approved NEDD6
## Synonyms Chromosome Accession.Numbers
## 1 NOV, QM, DXS648E, DXS648, FLJ23544, L10 Xq28 AB007170
## 2 Csa-19, L10A 6p21.31 U12404
## RefSeq.IDs Gene.Family.Tag Gene.family.description Gene.family.ID
## 1 NM_006013 RPL L ribosomal proteins 729
## 2 NM_007104 RPL L ribosomal proteins 729
is_mito = which(rowData(sce)$chromosome_name == "MT")
is_ribo = which(rowData(sce)$external_gene_name %in% ribo$Approved.Symbol)
length(is_mito)## [1] 33
length(is_ribo)## [1] 160
sce = calculateQCMetrics(sce, feature_controls=list(Mt=is_mito, Ri=is_ribo))
sort(colnames(colData(sce)))## [1] "Habib_cell_type"
## [2] "Habib_clusters"
## [3] "Habib_clusters_name"
## [4] "Habib_TSNE1"
## [5] "Habib_TSNE2"
## [6] "is_cell_control"
## [7] "log10_total_counts"
## [8] "log10_total_counts_endogenous"
## [9] "log10_total_counts_feature_control"
## [10] "log10_total_counts_Mt"
## [11] "log10_total_counts_Ri"
## [12] "log10_total_features"
## [13] "log10_total_features_by_counts"
## [14] "log10_total_features_by_counts_endogenous"
## [15] "log10_total_features_by_counts_feature_control"
## [16] "log10_total_features_by_counts_Mt"
## [17] "log10_total_features_by_counts_Ri"
## [18] "log10_total_features_endogenous"
## [19] "log10_total_features_feature_control"
## [20] "log10_total_features_Mt"
## [21] "log10_total_features_Ri"
## [22] "part_cell_id"
## [23] "pct_counts_endogenous"
## [24] "pct_counts_feature_control"
## [25] "pct_counts_in_top_100_features"
## [26] "pct_counts_in_top_100_features_endogenous"
## [27] "pct_counts_in_top_100_features_feature_control"
## [28] "pct_counts_in_top_100_features_Ri"
## [29] "pct_counts_in_top_200_features"
## [30] "pct_counts_in_top_200_features_endogenous"
## [31] "pct_counts_in_top_50_features"
## [32] "pct_counts_in_top_50_features_endogenous"
## [33] "pct_counts_in_top_50_features_feature_control"
## [34] "pct_counts_in_top_50_features_Ri"
## [35] "pct_counts_in_top_500_features"
## [36] "pct_counts_in_top_500_features_endogenous"
## [37] "pct_counts_Mt"
## [38] "pct_counts_Ri"
## [39] "pct_counts_top_100_features"
## [40] "pct_counts_top_100_features_endogenous"
## [41] "pct_counts_top_100_features_feature_control"
## [42] "pct_counts_top_100_features_Ri"
## [43] "pct_counts_top_200_features"
## [44] "pct_counts_top_200_features_endogenous"
## [45] "pct_counts_top_50_features"
## [46] "pct_counts_top_50_features_endogenous"
## [47] "pct_counts_top_50_features_feature_control"
## [48] "pct_counts_top_50_features_Ri"
## [49] "pct_counts_top_500_features"
## [50] "pct_counts_top_500_features_endogenous"
## [51] "sample_name"
## [52] "total_counts"
## [53] "total_counts_endogenous"
## [54] "total_counts_feature_control"
## [55] "total_counts_Mt"
## [56] "total_counts_Ri"
## [57] "total_features"
## [58] "total_features_by_counts"
## [59] "total_features_by_counts_endogenous"
## [60] "total_features_by_counts_feature_control"
## [61] "total_features_by_counts_Mt"
## [62] "total_features_by_counts_Ri"
## [63] "total_features_endogenous"
## [64] "total_features_feature_control"
## [65] "total_features_Mt"
## [66] "total_features_Ri"
par(mfrow=c(2,2), mar=c(5, 4, 1, 1), bty="n")
hist(log10(sce$total_counts), xlab="log10(Library sizes)", main="",
breaks=20, col="grey80", ylab="Number of cells")
hist(log10(sce$total_features), xlab="log10(# of expressed genes)",
main="", breaks=20, col="grey80", ylab="Number of cells")
hist(sce$pct_counts_Ri, xlab="Ribosome prop. (%)",
ylab="Number of cells", breaks=40, main="", col="grey80")
hist(sce$pct_counts_Mt, xlab="Mitochondrial prop. (%)",
ylab="Number of cells", breaks=80, main="", col="grey80")par(mfrow=c(2,2), mar=c(5, 4, 1, 1), bty="n")
smoothScatter(log10(sce$total_counts), log10(sce$total_features),
xlab="log10(Library sizes)", ylab="log10(# of expressed genes)")
smoothScatter(log10(sce$total_counts), sce$pct_counts_Ri,
xlab="log10(Library sizes)", ylab="Ribosome prop. (%)")
smoothScatter(log10(sce$total_counts), sce$pct_counts_Mt,
xlab="log10(Library sizes)", ylab="Mitochondrial prop. (%)")
smoothScatter(sce$pct_counts_Ri,sce$pct_counts_Mt,
xlab="Ribosome prop. (%)", ylab="Mitochondrial prop. (%)")From the QC results, we will filter on the metrics by identifying outliers using isOutlier. Note that the cells assigned to cluster 18 by (Habib et al. 2017) will all be excluded.
libsize_drop = isOutlier(sce$total_counts,nmads = 3,type = "lower",log = TRUE)
feature_drop = isOutlier(sce$total_features_by_counts,nmads = 3,
type = "lower",log = TRUE)
mito_drop = isOutlier(sce$pct_counts_Mt,nmads = 3,type = "higher")
ribo_drop = isOutlier(sce$pct_counts_Ri,nmads = 3,type = "higher")
keep = !(libsize_drop | feature_drop | mito_drop | ribo_drop)
smart_df(ByLibSize = sum(libsize_drop),ByFeature = sum(feature_drop),
ByMito = sum(mito_drop),ByRibo = sum(ribo_drop),
Remaining = sum(keep))## ByLibSize ByFeature ByMito ByRibo Remaining
## 1 0 0 1193 735 13181
smart_table(colData(sce)$Habib_clusters,keep)## keep
## FALSE TRUE
## 1 56 3048
## 2 4 293
## 3 31 390
## 4 8 737
## 5 18 874
## 6 51 772
## 7 32 1421
## 8 126 1078
## 9 124 581
## 10 234 1484
## 11 92 1155
## 12 106 578
## 13 108 281
## 14 40 161
## 15 279 120
## 16 171 83
## 17 126 74
## 18 132 0
## 19 44 51
smart_table(colData(sce)$Habib_clusters_name,keep)## keep
## FALSE TRUE
## ASC1 126 1078
## ASC2 124 581
## END 279 120
## exCA1 31 390
## exCA3 8 737
## exDG 32 1421
## exPFC1 56 3048
## exPFC2 4 293
## GABA1 18 874
## GABA2 51 772
## MG 108 281
## NSC 40 161
## ODC1 234 1484
## ODC2 92 1155
## OPC 106 578
## <NA> 473 208
sce = sce[,keep]
dim(sce)## [1] 31930 13181
rowData(sce)[1:2,]## DataFrame with 2 rows and 21 columns
## hgnc_symbol ensembl_gene_id external_gene_name chromosome_name
## <character> <character> <character> <character>
## 1 ENSG00000273203 AC006946.16 22
## 2 ENSG00000273442 AC006946.17 22
## start_position end_position strand description
## <integer> <integer> <integer> <character>
## 1 17548711 17551565 1
## 2 17561591 17562346 1
## percentage_gene_gc_content gene_biotype is_feature_control
## <numeric> <character> <logical>
## 1 41.61 lincRNA FALSE
## 2 41.67 lincRNA FALSE
## is_feature_control_Mt is_feature_control_Ri mean_counts
## <logical> <logical> <numeric>
## 1 FALSE FALSE 0.00548018445498897
## 2 FALSE FALSE 0.00360890195816347
## log10_mean_counts n_cells_by_counts pct_dropout_by_counts total_counts
## <numeric> <integer> <numeric> <integer>
## 1 0.0023735161394708 79 99.4720310098242 82
## 2 0.00156450482886486 52 99.6524761077324 54
## log10_total_counts n_cells_counts pct_dropout_counts
## <numeric> <integer> <numeric>
## 1 1.91907809237607 79 99.4720310098242
## 2 1.74036268949424 52 99.6524761077324
summary(rowData(sce)$mean_counts)## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00007 0.00033 0.00287 0.02972 0.02484 107.63510
summary(rowData(sce)$mean_counts[rowData(sce)$mean_counts>0])## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00007 0.00033 0.00287 0.02972 0.02484 107.63510
summary(rowData(sce)$n_cells_counts)## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.0 5.0 40.5 307.8 341.0 13416.0
par(mfrow=c(1,3), mar=c(5,4,1,1))
hist(log10(rowData(sce)$mean_counts+1e-6), col="grey80", main="",
breaks=40, xlab="log10(ave # of UMI + 1e-6)")
hist(log10(rowData(sce)$n_cells_counts+1), col="grey80", main="",
breaks=40, xlab="log10(# of expressed cells + 1)")
plot(log10(rowData(sce)$mean_counts+1e-6), pch=16, col=rgb(0,0,0,0.4),
log10(rowData(sce)$n_cells_counts + 1),
xlab="log10(ave # of UMI + 1e-6)",
ylab="log10(# of expressed cells + 1)")tb1 = table(rowData(sce)$n_cells_counts)
tb1[1:11]##
## 1 2 3 4 5 6 7 8 9 10 11
## 3302 1958 1280 1029 843 667 552 525 473 383 369
We remove those genes that are lowly expressed. (Habib et al. 2017) mentioned in section “Gene detection and quality controls” of Online Methods
Nuclei with less than 200 detected genes and less than 10,000 usable reads were filtered out.
and
A gene is considered detected in a cell if it has at least two unique UMIs (transcripts) associated with it. For each analysis, genes were removed that were detected in less than 10 nuclei.
Therefore it seems we should remove all the nuclei. having less than 200 genes with two or more UMI counts. However, this would remove the majority of the nuclei. Therefore, we conlcude that they meant to remove the nuclei. having less than 200 genes with one or more UMI counts. To filter genes, we follow their threshold to remove genes with two or more UMIs in less than 10 nuclei.
Note that the variable strand need to be renamed, otherwise there is an error message that such a variabel name cannot be used.
names(rowData(sce))[names(rowData(sce)) == "strand"] = "strand_n"
n_genes = colSums(counts(sce) >= 2)
summary(n_genes)## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 10.0 43.0 71.0 112.5 133.0 1555.0
table(n_genes >= 100)##
## FALSE TRUE
## 8440 4741
table(n_genes >= 200)##
## FALSE TRUE
## 11339 1842
n_genes = colSums(counts(sce) >= 1)
summary(n_genes)## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 198.0 351.0 540.0 670.5 845.0 4233.0
table(n_genes >= 100)##
## TRUE
## 13181
table(n_genes >= 200)##
## FALSE TRUE
## 2 13179
n_cells = rowSums(counts(sce) >= 2)
summary(n_cells)## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 0.00 1.00 46.45 22.00 11725.00
table(n_cells >= 10)##
## FALSE TRUE
## 21323 10607
sce = sce[which(n_cells >= 10),]
dim(sce)## [1] 10607 13181
Next we check those highly expressed genes
par(mar=c(5,4,1,1))
od1 = order(rowData(sce)$mean_counts, decreasing = TRUE)
barplot(rowData(sce)$mean_counts[od1[20:1]], las=1,
names.arg=rowData(sce)$hgnc_symbol[od1[20:1]],
horiz=TRUE, cex.names=0.8, cex.axis=0.8,
xlab="ave # of UMI")A simple solution for normalization and stablizing expression varaince across genes is to tranform the count data by log(count/size.factor + 1). One may calcualte size.factor per cell as the total number of UMIs, and this assumes the total expression are the same across all the cells. However, the total expression of each cell may vary with respect to cell type and/or cell size, and the computeSumFactors function in R package scran provides a more sophisicated way to calcualte size.factor to allow such variaation across cells (Lun, Bach, and Marioni 2016). computeSumFactors can use initial clustering of cells to normalize expression within and beetween clusters. Within a cluster, it estimates the size factor for many groups of cells so that there are more groups than cells, and then it can calcualte the size factor per cell using a lienar deconvolution system. We remove all the cells with negative or very small size factors (< 0.01).
As shown in the following plot, the final size factor estimation is indeed highly correlated with the naive definition by total count.
Finally, the command normalize(sce) adds the normalized expression into the variable sce, which can be accessed by logcounts(sce) = log2(gene_cell_count / size_factor + 1).
date()## [1] "Thu Jan 31 16:12:40 2019"
clusters = quickCluster(sce, min.mean=0.1, method="igraph")
table(clusters)## clusters
## 1 2 3 4 5 6
## 503 1590 1591 2864 2062 4571
date()## [1] "Thu Jan 31 16:13:23 2019"
sce = computeSumFactors(sce, cluster=clusters, min.mean=0.1)
date()## [1] "Thu Jan 31 16:15:15 2019"
summary(sizeFactors(sce))## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0927 0.4487 0.7515 1.0000 1.2610 9.7585
sort(sizeFactors(sce))[1:30]## [1] 0.09269791 0.10089499 0.10199474 0.11052233 0.11186730 0.11451162
## [7] 0.11579893 0.11606944 0.11686686 0.11799359 0.12004139 0.12039394
## [13] 0.12144386 0.12297642 0.12451394 0.12485572 0.12649517 0.12703992
## [19] 0.13203308 0.13261365 0.13403081 0.13450431 0.13482033 0.13840999
## [25] 0.13862154 0.13927866 0.14119765 0.14164402 0.14274874 0.14293551
dim(sce)## [1] 10607 13181
sce = sce[,which(sizeFactors(sce) > 0.01)]
dim(sce)## [1] 10607 13181
par(mfrow=c(1,2), mar=c(5,4,2,1), bty="n")
smoothScatter(sce$total_counts, sizeFactors(sce), log="xy",
xlab="total counts", ylab="size factors")
plot(sce$total_counts, sizeFactors(sce), log="xy",
xlab="total counts", ylab="size factors",
cex=0.3, pch=20, col=rgb(0.1,0.2,0.7,0.3))
abline(h=0.05)dim(sce)## [1] 10607 13181
sce = sce[,which(sizeFactors(sce) > 0.05)]
dim(sce)## [1] 10607 13181
sce = normalize(sce) For dimension reduction, such as calculating PCA or performing TSNE, we should start by identifying a subset of genes with high level of biological signal relative to background (technical) noise. These genes are referred to as HVGs (highly variable genes). The decomposeVar function from R/cran is designed for this task.
new_trend = makeTechTrend(x=sce)
fit = trendVar(sce, use.spikes=FALSE, loess.args=list(span=0.05))
par(mfrow=c(1,1), mar=c(5,4,2,1), bty="n")
plot(fit$mean, fit$var, pch=20, col=rgb(0.1,0.2,0.7,0.6),
xlab="log(mean)", ylab="var")
curve(fit$trend(x), col="orange", lwd=2, add=TRUE)
curve(new_trend(x), col="red", lwd=2, add=TRUE)
legend("top", legend=c("Poisson noise", "observed trend"),
lty=1, lwd=2, col=c("red", "orange"), bty="n")fit$trend = new_trend
dec = decomposeVar(fit=fit)
top_dec = dec[order(dec$bio, decreasing=TRUE),]
plotExpression(sce, features=rownames(top_dec)[1:10])When performing PCA, we can use all the genes or just those genes with high signal-to-noise ratio (HGVs). TSNE analysis is usually based on the top PCs rather than the original gene expression data. We first perform PCA using all the genes and the function denoisePCA can automatically select the PCs based on modeling of technical noise.
date()## [1] "Thu Jan 31 16:15:51 2019"
sce = denoisePCA(sce,technical=new_trend,approx=TRUE) # Using the Poisson trend fit
date()## [1] "Thu Jan 31 16:17:30 2019"
dim(reducedDim(sce,"PCA"))## [1] 13181 94
par(mfrow=c(1,1))
plot(log10(attr(reducedDim(sce), "percentVar")), xlab="PC",
ylab="log10(Prop of variance explained)", pch=20, cex=0.6,
col=rgb(0.8, 0.2, 0.2, 0.5))
abline(v=ncol(reducedDim(sce,"PCA")), lty=2, col="red")df_redDim = smart_df(colData(sce)[,c("sample_name","part_cell_id",
paste0("Habib_TSNE",1:2),"log10_total_features","Habib_clusters",
"Habib_clusters_name","Habib_cell_type")],
reducedDim(sce, "PCA")[,1:2])
rownames(df_redDim) = NULL
ggplot_custom(DATA = df_redDim,X = "Habib_TSNE1",Y = "Habib_TSNE2",
COL = "part_cell_id",TYPE = "cat")ggplot_custom(DATA = df_redDim,X = "Habib_TSNE1",Y = "Habib_TSNE2",
COL = "log10_total_features",TYPE = "cont")ggplot_custom(DATA = df_redDim,X = "Habib_TSNE1",Y = "Habib_TSNE2",
COL = "Habib_clusters",TYPE = "cat")ggplot_custom(DATA = df_redDim,X = "Habib_TSNE1",Y = "Habib_TSNE2",
COL = "Habib_clusters_name",TYPE = "cat")ggplot_custom(DATA = df_redDim,X = "Habib_TSNE1",Y = "Habib_TSNE2",
COL = "Habib_cell_type",TYPE = "cat")date()## [1] "Thu Jan 31 16:17:34 2019"
sce = runTSNE(sce,use_dimred="PCA",perplexity=30,rand_seed=100)
date()## [1] "Thu Jan 31 16:20:03 2019"
tmp_df = smart_df(reducedDim(sce,"TSNE"))
rownames(tmp_df) = NULL
names(tmp_df) = paste0("my_TSNE",1:2)
df_redDim = smart_df(df_redDim,tmp_df); rm(tmp_df)
df_redDim[1:5,]## sample_name part_cell_id Habib_TSNE1 Habib_TSNE2
## 1 hHP1_AACACTATCTAC hHP1 -8.436537 7.721106
## 2 hHP1_CTACGCATCCAT hHP1 -6.606278 8.099538
## 3 hHP1_TCGGTACTAATA hHP1 -6.724967 8.011943
## 4 hHP1_CCCGCACGCTAT hHP1 -6.286364 7.892359
## 5 hHP1_TCATTTTGTCAT hHP1 -10.455725 2.374719
## log10_total_features Habib_clusters Habib_clusters_name Habib_cell_type
## 1 3.393048 4 exCA3 exCA3
## 2 3.453624 3 exCA1 exCA1
## 3 3.458638 3 exCA1 exCA1
## 4 3.419129 3 exCA1 exCA1
## 5 3.345178 3 exCA1 exCA1
## PC1 PC2 my_TSNE1 my_TSNE2
## 1 3.326051 -1.240640 20.77887 11.296457
## 2 3.747268 -1.075909 20.85935 8.749623
## 3 4.543743 -1.682305 22.32118 6.832821
## 4 4.255275 -1.072497 22.64867 7.073299
## 5 3.136629 -1.083864 20.22245 4.447358
ggplot_custom(DATA = df_redDim,X = "my_TSNE1",Y = "my_TSNE2",
COL = "part_cell_id",TYPE = "cat")ggplot_custom(DATA = df_redDim,X = "my_TSNE1",Y = "my_TSNE2",
COL = "log10_total_features",TYPE = "cont")ggplot_custom(DATA = df_redDim,X = "my_TSNE1",Y = "my_TSNE2",
COL = "Habib_clusters",TYPE = "cat")ggplot_custom(DATA = df_redDim,X = "my_TSNE1",Y = "my_TSNE2",
COL = "Habib_clusters_name",TYPE = "cat")ggplot_custom(DATA = df_redDim,X = "my_TSNE1",Y = "my_TSNE2",
COL = "Habib_cell_type",TYPE = "cat")We select around top 1,000 genes (based on FDR and biological residual thresholds) as HVGs (highly variable genes).
summary(dec$bio)## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -0.022095 0.000137 0.003208 0.009682 0.008864 5.261563
dec1 = dec
dec1$bio[which(dec$bio < 1e-5)] = 1e-5
dec1$FDR[which(dec$FDR < 1e-100)] = 1e-100
par(mfrow=c(1,2))
hist(log10(dec1$bio),breaks=100,main="")
hist(log10(dec1$FDR),breaks=100,main="")summary(dec$FDR[dec$bio > 0.01])## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000e+00 0.000e+00 0.000e+00 1.356e-05 0.000e+00 2.618e-02
summary(dec$FDR[dec$bio > 0.03])## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000e+00 0.000e+00 0.000e+00 5.310e-10 0.000e+00 3.248e-07
table(dec$FDR < 1e-10,dec$bio > 0.03)##
## FALSE TRUE
## FALSE 5471 6
## TRUE 4448 682
table(dec$FDR < 1e-10,dec$bio > 0.01)##
## FALSE TRUE
## FALSE 5347 130
## TRUE 2892 2238
table(dec$FDR < 1e-10,dec$bio > 0.02)##
## FALSE TRUE
## FALSE 5455 22
## TRUE 4044 1086
w2kp = which(dec$FDR < 1e-10 & dec$bio > 0.02)
sce_hvg = sce[w2kp,]
sce_hvg## class: SingleCellExperiment
## dim: 1086 13181
## metadata(1): log.exprs.offset
## assays(2): counts logcounts
## rownames(1086): AATK AC004158.3 ... ZSWIM6 ZNF704
## rowData names(21): hgnc_symbol ensembl_gene_id ... n_cells_counts
## pct_dropout_counts
## colnames(13181): hHP1_AACACTATCTAC hHP1_CTACGCATCCAT ...
## PFC-CD_CACGCTCCCCTA PFC-CD_CGTCATTAGCAG
## colData names(66): sample_name part_cell_id ...
## pct_counts_top_50_features_Ri pct_counts_top_100_features_Ri
## reducedDimNames(2): PCA TSNE
## spikeNames(0):
edat = t(as.matrix(logcounts(sce_hvg)))
edat = scale(edat)
dim(edat)## [1] 13181 1086
edat[1:2,1:3]## AATK AC004158.3 ABLIM1
## hHP1_AACACTATCTAC -0.3259988 -0.1787744 -0.3700985
## hHP1_CTACGCATCCAT 0.1885733 -0.1787744 -0.3700985
Perform PCA and use the top 50 PCs for TSNE projection. When calculating PCs, we use log-transformed normalized gene expression data: log2(norm_express+1).
library(svd)
library(Rtsne)
date()## [1] "Thu Jan 31 16:20:15 2019"
ppk = propack.svd(edat,neig=50)
date()## [1] "Thu Jan 31 16:20:25 2019"
pca = t(ppk$d*t(ppk$u)) # calculates pc scores aka principal components
tmp_df = smart_df(pca[,1:2])
names(tmp_df) = paste0("HVG_PC",seq(ncol(tmp_df)))
df_hvg = smart_df(colData(sce)[,c("sample_name","part_cell_id",
paste0("Habib_TSNE",1:2),
"log10_total_features","Habib_clusters",
"Habib_clusters_name","Habib_cell_type")],tmp_df)
rownames(df_hvg) = NULL
set.seed(100)
date()## [1] "Thu Jan 31 16:20:25 2019"
tsne = Rtsne(pca, pca = FALSE)
date()## [1] "Thu Jan 31 16:22:49 2019"
tmp_df = smart_df(tsne$Y)
names(tmp_df) = paste0("HVG_TSNE",seq(ncol(tmp_df)))
df_hvg = smart_df(df_hvg,tmp_df)
ggplot_custom(DATA = df_hvg,X = "HVG_TSNE1",Y = "HVG_TSNE2",
COL = "part_cell_id",TYPE = "cat")ggplot_custom(DATA = df_hvg,X = "HVG_TSNE1",Y = "HVG_TSNE2",
COL = "log10_total_features",TYPE = "cont")ggplot_custom(DATA = df_hvg,X = "HVG_TSNE1",Y = "HVG_TSNE2",
COL = "Habib_clusters",TYPE = "cat")ggplot_custom(DATA = df_hvg,X = "HVG_TSNE1",Y = "HVG_TSNE2",
COL = "Habib_clusters_name",TYPE = "cat")ggplot_custom(DATA = df_hvg,X = "HVG_TSNE1",Y = "HVG_TSNE2",
COL = "Habib_cell_type",TYPE = "cat")reducedDims(sce_hvg) = SimpleList(PCA=pca, TSNE=tsne$Y)
sce_hvg## class: SingleCellExperiment
## dim: 1086 13181
## metadata(1): log.exprs.offset
## assays(2): counts logcounts
## rownames(1086): AATK AC004158.3 ... ZSWIM6 ZNF704
## rowData names(21): hgnc_symbol ensembl_gene_id ... n_cells_counts
## pct_dropout_counts
## colnames(13181): hHP1_AACACTATCTAC hHP1_CTACGCATCCAT ...
## PFC-CD_CACGCTCCCCTA PFC-CD_CGTCATTAGCAG
## colData names(66): sample_name part_cell_id ...
## pct_counts_top_50_features_Ri pct_counts_top_100_features_Ri
## reducedDimNames(2): PCA TSNE
## spikeNames(0):
Next we cluster the cells using a simple kmeans method on the top 50 PCs. We set the number of clusters to be 5 thru 15, to include the 12 cell types reported by (Habib et al. 2017).
all_num_clust = c(11:15)
df_hvg = df_hvg[,!grepl("^KM_",names(df_hvg))]
for(num_clust in all_num_clust){
cat(paste0("KM with ",num_clust," clusters.\n"))
kmeans_out = kmeans(reducedDim(sce_hvg, "PCA"), centers = num_clust,
iter.max = 1e8, nstart = 2500,
algorithm = "MacQueen")
km_label = paste0("KM_",num_clust)
df_hvg[[km_label]] = as.factor(kmeans_out$cluster)
}## KM with 11 clusters.
## KM with 12 clusters.
## KM with 13 clusters.
## KM with 14 clusters.
## KM with 15 clusters.
Next seek to cluster cells using another method SC3. The code used here is based on SC3 manual. By default, when there are more than 5000 genes, SC3 will
selects a subset of cells uniformly at random (5,000), and obtains clusters from this subset. The inferred labels can be used to train a Support Vector Machine (SVM), which is employed to assign labels to the remaining cells.
By default, SC3 filter genes to select those with dropout percentage between 10 and 90%.
summary(rowData(sce)$pct_dropout_counts)## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 10.34 93.16 96.08 94.26 97.71 99.70
table(rowData(sce)$pct_dropout_counts < 90)##
## FALSE TRUE
## 9189 1418
This will end up with 1418 genes. However, we found the clustering resutls using these 1418 genes have considerable discrepency with clustering resutls from Kmeans and cell types reported by (Habib et al. 2017). Therefore, we chose to use those genes identified from previous step for SC3. Following the recommendation for runing SC3, we first clustering 2000 cells and then run SVM and predict labels of all other cells.
library(SC3)
rowData(sce_hvg)$feature_symbol = rowData(sce_hvg)$external_gene_name
date()## [1] "Thu Jan 31 16:58:22 2019"
all_ks = c(10,12)
sce_hvg = sc3(sce_hvg,gene_filter = FALSE,
n_cores = num_cores,ks = all_ks,biology = TRUE,
rand_seed = 100,svm_num_cells = 2000)## Setting SC3 parameters...
## Your dataset contains more than 2000 cells. Adjusting the nstart parameter of kmeans to 50 for faster performance...
## Defining training cells for SVM using svm_num_cells parameter...
## Calculating distances between the cells...
## Performing transformations and calculating eigenvectors...
## Performing k-means clustering...
## Calculating consensus matrix...
## Calculating biology...
date()## [1] "Thu Jan 31 19:03:14 2019"
dim(colData(sce_hvg))## [1] 13181 70
colData(sce_hvg)[1:2,1:5]## DataFrame with 2 rows and 5 columns
## sample_name part_cell_id Habib_clusters
## <character> <character> <factor>
## hHP1_AACACTATCTAC hHP1_AACACTATCTAC hHP1 4
## hHP1_CTACGCATCCAT hHP1_CTACGCATCCAT hHP1 3
## Habib_clusters_name Habib_cell_type
## <character> <character>
## hHP1_AACACTATCTAC exCA3 exCA3
## hHP1_CTACGCATCCAT exCA1 exCA1
names(colData(sce_hvg))## [1] "sample_name"
## [2] "part_cell_id"
## [3] "Habib_clusters"
## [4] "Habib_clusters_name"
## [5] "Habib_cell_type"
## [6] "Habib_TSNE1"
## [7] "Habib_TSNE2"
## [8] "is_cell_control"
## [9] "total_features_by_counts"
## [10] "log10_total_features_by_counts"
## [11] "total_counts"
## [12] "log10_total_counts"
## [13] "pct_counts_in_top_50_features"
## [14] "pct_counts_in_top_100_features"
## [15] "pct_counts_in_top_200_features"
## [16] "pct_counts_in_top_500_features"
## [17] "total_features"
## [18] "log10_total_features"
## [19] "pct_counts_top_50_features"
## [20] "pct_counts_top_100_features"
## [21] "pct_counts_top_200_features"
## [22] "pct_counts_top_500_features"
## [23] "total_features_by_counts_endogenous"
## [24] "log10_total_features_by_counts_endogenous"
## [25] "total_counts_endogenous"
## [26] "log10_total_counts_endogenous"
## [27] "pct_counts_endogenous"
## [28] "pct_counts_in_top_50_features_endogenous"
## [29] "pct_counts_in_top_100_features_endogenous"
## [30] "pct_counts_in_top_200_features_endogenous"
## [31] "pct_counts_in_top_500_features_endogenous"
## [32] "total_features_endogenous"
## [33] "log10_total_features_endogenous"
## [34] "pct_counts_top_50_features_endogenous"
## [35] "pct_counts_top_100_features_endogenous"
## [36] "pct_counts_top_200_features_endogenous"
## [37] "pct_counts_top_500_features_endogenous"
## [38] "total_features_by_counts_feature_control"
## [39] "log10_total_features_by_counts_feature_control"
## [40] "total_counts_feature_control"
## [41] "log10_total_counts_feature_control"
## [42] "pct_counts_feature_control"
## [43] "pct_counts_in_top_50_features_feature_control"
## [44] "pct_counts_in_top_100_features_feature_control"
## [45] "total_features_feature_control"
## [46] "log10_total_features_feature_control"
## [47] "pct_counts_top_50_features_feature_control"
## [48] "pct_counts_top_100_features_feature_control"
## [49] "total_features_by_counts_Mt"
## [50] "log10_total_features_by_counts_Mt"
## [51] "total_counts_Mt"
## [52] "log10_total_counts_Mt"
## [53] "pct_counts_Mt"
## [54] "total_features_Mt"
## [55] "log10_total_features_Mt"
## [56] "total_features_by_counts_Ri"
## [57] "log10_total_features_by_counts_Ri"
## [58] "total_counts_Ri"
## [59] "log10_total_counts_Ri"
## [60] "pct_counts_Ri"
## [61] "pct_counts_in_top_50_features_Ri"
## [62] "pct_counts_in_top_100_features_Ri"
## [63] "total_features_Ri"
## [64] "log10_total_features_Ri"
## [65] "pct_counts_top_50_features_Ri"
## [66] "pct_counts_top_100_features_Ri"
## [67] "sc3_10_clusters"
## [68] "sc3_12_clusters"
## [69] "sc3_10_log2_outlier_score"
## [70] "sc3_12_log2_outlier_score"
date()## [1] "Thu Jan 31 19:03:14 2019"
sce_hvg = sc3_run_svm(sce_hvg, ks = all_ks)
date()## [1] "Thu Jan 31 19:04:18 2019"
Next we compare the clustering results from SC3 and the reported cell types.
for(one_ks in all_ks){
sc3_label = paste0("sc3_",one_ks,"_clusters")
df_hvg[[sc3_label]] = as.factor(colData(sce_hvg)[,sc3_label])
}We obtainted the cell type and clustering resutls from (Habib et al. 2017). Supplementary Table 10: supplement nmeth.4407-S10.xlsx file. Here we compare the cell type reported by Habib et al. (2017) with ours. Habib et al. (2017) identified genes with high variation by fitting a gamma distribution on the relation between mean and coefficient of variation and chose the number of PCs based on “the largest eigen value gap”. It was not clear what is the number of PCs used. Then they used these top PCs to perform tSNE anlaysis and clustering using a graph based method.
tmp_lab = smart_RT(file.path(work_dir,"cluster_num_label.txt"),
sep = "\t",header = TRUE)
tmp_lab## Cell.ID Name Cell.Type.ID Name.1
## 1 1 exPFC1 1 exPFC
## 2 2 exPFC2 1 exPFC
## 3 3 exCA1 2 exCA1
## 4 4 exCA3 3 exCA3
## 5 5 GABA1 4 GABA
## 6 6 GABA2 4 GABA
## 7 7 exDG 5 exDG
## 8 8 ASC1 6 ASC
## 9 9 ASC2 6 ASC
## 10 10 ODC1 7 ODC
## 11 11 ODC2 7 ODC
## 12 12 OPC 8 OPC
## 13 13 MG 9 MG
## 14 14 NSC 10 NSC
## 15 15 END 11 END
tmp_lab = name_change(tmp_lab,"Name","Cluster.Name")
tmp_lab = name_change(tmp_lab,"Name.1","Cell_Type")
tmp_res = smart_RT(file.path(work_dir,"paper_cluster_res.txt"),
sep = "\t",header = TRUE,comment.char = "")
dim(tmp_res); tmp_res[1:5,]## [1] 14963 5
## Cell.ID X.Genes X.Transcripts Cluster.ID Cluster.Name
## 1 hHP1_AGACCGCGACTA 2289 3946 1 exPFC1
## 2 hHP1_AAATCGTAGTAG 1311 2077 1 exPFC1
## 3 hHP1_TCACCAGGCGAT 676 978 1 exPFC1
## 4 hHP1_GCATACAGTGAA 916 1425 1 exPFC1
## 5 hHP1_CCCCCTCAGTAC 481 641 1 exPFC1
tmp_res = name_change(tmp_res,"X.Genes","nGenes")
tmp_res = name_change(tmp_res,"X.Transcripts","nTranscripts")
tmp_res = smart_merge(tmp_res, tmp_lab[,c("Cluster.Name","Cell_Type")],
all.x=TRUE)
summary(tmp_res$nGenes)## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 201.0 337.0 529.0 658.4 826.0 4249.0
df_hvg$Cell.ID = colnames(sce_hvg)
smart_table(df_hvg$Cell.ID %in% tmp_res$Cell.ID)##
## TRUE
## 13181
# Merge and compare
all_clust_res = smart_merge(df_hvg,tmp_res)
sc3_res = smart_df(colData(sce_hvg)[,paste0("sc3_",all_ks,"_clusters")])
for(ks in all_ks){
sc3_label = paste0("sc3_",ks,"_clusters")
sc3_res[,sc3_label] = as.factor(sc3_res[,sc3_label])
}
sc3_res$Cell.ID = colnames(sce_hvg)
all_clust_res = smart_merge(all_clust_res, sc3_res)
dim(all_clust_res)## [1] 13181 25
all_clust_res[1:5,]## Cell.ID sc3_10_clusters sc3_12_clusters sample_name
## 1 hCc_AAAAAGCTACAA 3 1 hCc_AAAAAGCTACAA
## 2 hCc_AAACAGGTGAGG 1 1 hCc_AAACAGGTGAGG
## 3 hCc_AAACCCTTTACA 1 1 hCc_AAACCCTTTACA
## 4 hCc_AAACGTGACGGA 1 1 hCc_AAACGTGACGGA
## 5 hCc_AAAGAACTCGCG 2 1 hCc_AAAGAACTCGCG
## part_cell_id Habib_TSNE1 Habib_TSNE2 log10_total_features Habib_clusters
## 1 hCc 2.804881 -1.888592 2.866287 1
## 2 hCc 16.965708 -6.768078 3.170262 5
## 3 hCc 17.828567 -2.514648 3.044540 5
## 4 hCc 14.756007 -4.965194 2.849419 5
## 5 hCc -1.552130 3.860966 2.716838 1
## Habib_clusters_name Habib_cell_type HVG_PC1 HVG_PC2 HVG_TSNE1
## 1 exPFC1 exPFC -3.809914 -1.148090 -2.054991
## 2 GABA1 GABA -3.041883 -1.756074 -9.557901
## 3 GABA1 GABA -4.335472 -1.149693 -8.302459
## 4 GABA1 GABA -3.614090 -1.958961 -7.234844
## 5 exPFC1 exPFC -4.001498 -2.066558 8.784212
## HVG_TSNE2 KM_11 KM_12 KM_13 KM_14 KM_15 Cluster.Name nGenes nTranscripts
## 1 6.697855 7 2 5 14 9 exPFC1 736 1034
## 2 20.722796 5 7 10 9 8 GABA1 1480 2391
## 3 18.022212 7 2 5 14 9 GABA1 1107 1757
## 4 15.676638 7 2 5 14 9 GABA1 708 935
## 5 10.251190 7 2 5 14 9 exPFC1 520 673
## Cluster.ID Cell_Type
## 1 1 exPFC
## 2 5 GABA
## 3 5 GABA
## 4 5 GABA
## 5 1 exPFC
smart_table(all_clust_res[,c("Cell_Type","Cluster.ID")])## Cluster.ID
## Cell_Type 1 2 3 4 5 6 7 8 9 10 11 12 13
## ASC 0 0 0 0 0 0 0 1078 581 0 0 0 0
## END 0 0 0 0 0 0 0 0 0 0 0 0 0
## exCA1 0 0 390 0 0 0 0 0 0 0 0 0 0
## exCA3 0 0 0 737 0 0 0 0 0 0 0 0 0
## exDG 0 0 0 0 0 0 1421 0 0 0 0 0 0
## exPFC 3048 293 0 0 0 0 0 0 0 0 0 0 0
## GABA 0 0 0 0 874 772 0 0 0 0 0 0 0
## MG 0 0 0 0 0 0 0 0 0 0 0 0 281
## NSC 0 0 0 0 0 0 0 0 0 0 0 0 0
## ODC 0 0 0 0 0 0 0 0 0 1484 1155 0 0
## OPC 0 0 0 0 0 0 0 0 0 0 0 578 0
## <NA> 0 0 0 0 0 0 0 0 0 0 0 0 0
## Cluster.ID
## Cell_Type 14 15 16 18
## ASC 0 0 0 0
## END 0 83 0 0
## exCA1 0 0 0 0
## exCA3 0 0 0 0
## exDG 0 0 0 0
## exPFC 0 0 0 0
## GABA 0 0 0 0
## MG 0 0 0 0
## NSC 161 0 0 0
## ODC 0 0 0 0
## OPC 0 0 0 0
## <NA> 0 0 74 171
for(num_clust in all_num_clust){
km_label = paste0("KM_",num_clust)
print(smart_table(all_clust_res[,c("Cell_Type",km_label)]))
t2 = melt(smart_table(all_clust_res[,c("Cell_Type",km_label)]))
colnames(t2)[2] = "cluster"
print(gg.heatmap(t2,paste0("kmeans ",num_clust," clusters")))
}## KM_11
## Cell_Type 1 2 3 4 5 6 7 8 9 10 11
## ASC 0 4 20 0 1 0 36 12 1067 1 518
## END 68 0 1 0 1 0 5 4 2 1 1
## exCA1 0 0 5 0 6 0 76 296 7 0 0
## exCA3 0 0 3 0 4 0 209 519 2 0 0
## exDG 0 0 3 0 0 0 16 1402 0 0 0
## exPFC 0 6 54 0 31 2 3131 56 46 5 10
## GABA 1 0 3 0 1172 0 452 14 4 0 0
## MG 0 1 14 0 0 223 20 11 9 1 2
## NSC 0 0 0 144 0 0 0 8 7 0 2
## ODC 0 893 1689 0 6 2 25 11 7 6 0
## OPC 0 5 6 0 3 0 23 4 3 533 1
## <NA> 0 0 17 0 55 2 13 147 9 1 1
## KM_12
## Cell_Type 1 2 3 4 5 6 7 8 9 10 11 12
## ASC 518 34 3 0 5 1066 2 1 11 0 0 19
## END 1 6 1 68 0 2 1 1 2 0 0 1
## exCA1 0 76 118 0 0 7 5 0 179 0 0 5
## exCA3 0 76 630 0 0 1 4 0 23 0 0 3
## exDG 0 15 23 0 0 0 0 0 1380 0 0 3
## exPFC 10 3107 53 0 6 46 20 5 38 2 0 54
## GABA 0 443 37 1 0 4 1154 0 4 0 0 3
## MG 2 18 4 0 1 9 0 1 9 223 0 14
## NSC 2 0 1 0 0 7 0 0 6 0 145 0
## ODC 0 26 1 0 893 7 5 6 10 2 0 1689
## OPC 1 20 2 0 5 3 3 533 5 0 0 6
## <NA> 1 13 11 0 0 9 48 1 143 2 0 17
## KM_13
## Cell_Type 1 2 3 4 5 6 7 8 9 10 11 12 13
## ASC 11 4 5 512 35 0 0 0 16 2 1069 4 1
## END 2 0 1 1 6 0 0 69 0 1 1 1 1
## exCA1 173 0 0 0 73 0 0 0 8 5 6 125 0
## exCA3 22 0 0 0 74 0 0 0 3 4 1 633 0
## exDG 1378 0 0 0 15 0 0 0 4 0 0 24 0
## exPFC 37 6 12 10 3108 2 0 0 47 19 42 53 5
## GABA 4 0 0 0 438 0 0 1 5 1157 4 37 0
## MG 10 1 1 2 16 221 0 0 15 0 9 4 2
## NSC 6 0 0 3 0 0 146 0 0 0 5 1 0
## ODC 8 685 719 0 25 1 0 0 1184 5 5 1 6
## OPC 5 6 0 1 20 0 0 0 5 3 2 2 534
## <NA> 142 0 2 1 13 2 0 0 15 48 9 12 1
## KM_14
## Cell_Type 1 2 3 4 5 6 7 8 9 10 11 12 13
## ASC 0 13 4 2 755 5 0 0 1 9 354 0 483
## END 0 0 0 1 4 1 0 68 1 1 1 0 0
## exCA1 0 7 0 125 7 0 0 0 4 173 0 0 1
## exCA3 0 3 0 633 1 0 0 0 4 22 0 0 0
## exDG 0 4 0 22 0 0 0 0 0 1380 0 0 0
## exPFC 0 43 6 53 47 12 5 0 18 37 5 1 7
## GABA 0 5 0 37 6 0 0 1 1153 4 0 0 0
## MG 0 16 1 4 9 1 2 0 0 10 1 219 2
## NSC 144 0 0 1 7 0 0 0 0 6 3 0 0
## ODC 0 1186 682 1 6 720 5 0 5 8 0 1 0
## OPC 0 6 5 2 3 0 532 0 3 5 1 0 1
## <NA> 0 15 0 12 9 2 1 0 48 142 0 2 1
## KM_14
## Cell_Type 14
## ASC 33
## END 6
## exCA1 73
## exCA3 74
## exDG 15
## exPFC 3107
## GABA 440
## MG 16
## NSC 0
## ODC 25
## OPC 20
## <NA> 13
## KM_15
## Cell_Type 1 2 3 4 5 6 7 8 9 10 11 12 13
## ASC 787 0 455 0 13 4 0 1 33 2 10 0 5
## END 3 0 0 68 0 0 0 1 6 1 1 0 1
## exCA1 6 0 1 0 8 0 0 4 76 118 177 0 0
## exCA3 1 0 0 0 3 0 0 4 76 630 23 0 0
## exDG 0 0 0 0 4 0 0 0 15 23 1379 0 0
## exPFC 46 0 7 0 43 5 5 18 3107 53 37 2 13
## GABA 6 0 0 1 5 0 0 1153 440 37 4 0 0
## MG 9 0 2 0 16 1 2 0 16 4 10 219 1
## NSC 5 144 1 0 0 0 0 0 0 1 7 0 0
## ODC 5 0 0 0 1172 676 2 6 25 1 8 1 740
## OPC 2 0 1 0 4 5 520 3 20 2 5 0 0
## <NA> 9 0 1 0 15 0 0 48 13 11 143 2 2
## KM_15
## Cell_Type 14 15
## ASC 0 349
## END 0 2
## exCA1 0 0
## exCA3 0 0
## exDG 0 0
## exPFC 0 5
## GABA 0 0
## MG 0 1
## NSC 0 3
## ODC 3 0
## OPC 15 1
## <NA> 1 0
for(ks in all_ks){
sc3_label = paste0("sc3_",ks,"_clusters")
print(smart_table(all_clust_res[,c("Cell_Type",sc3_label)]))
t2 = melt(smart_table(all_clust_res[,c("Cell_Type",sc3_label)]))
colnames(t2)[2] = "cluster"
print(gg.heatmap(t2,paste0("sc3 ",ks," clusters")))
}## sc3_10_clusters
## Cell_Type 1 2 3 4 5 6 7 8 9 10
## ASC 4 7 26 10 1 14 262 1321 14 0
## END 0 1 5 1 0 1 3 1 8 63
## exCA1 10 29 217 62 51 3 3 5 10 0
## exCA3 6 99 566 19 36 2 1 4 4 0
## exDG 0 3 244 1156 13 2 1 1 1 0
## exPFC 70 325 1896 30 14 43 18 182 755 8
## GABA 1133 23 189 5 3 3 1 46 242 1
## MG 3 3 18 6 1 10 13 12 28 187
## NSC 0 0 3 1 0 0 0 5 23 129
## ODC 5 8 24 7 247 1544 782 11 10 1
## OPC 5 441 18 6 2 5 3 7 5 86
## <NA> 15 2 40 93 10 11 4 16 53 1
## sc3_12_clusters
## Cell_Type 1 2 3 4 5 6 7 8 9 10 11 12
## ASC 30 1310 6 22 3 3 0 6 3 255 8 13
## END 6 1 61 3 1 0 0 1 0 1 8 1
## exCA1 52 4 2 8 50 1 1 227 1 2 32 10
## exCA3 93 1 6 3 97 4 1 517 3 0 8 4
## exDG 22 0 0 4 2 2 0 223 1 0 1166 1
## exPFC 2226 47 17 55 13 8 0 34 141 7 31 762
## GABA 668 93 9 3 581 2 4 12 44 1 7 222
## MG 18 10 191 21 1 1 0 0 1 1 32 5
## NSC 1 5 129 0 0 0 0 2 0 1 23 0
## ODC 21 8 3 2179 2 0 0 5 1 1 8 411
## OPC 16 7 443 8 2 2 0 0 1 1 94 4
## <NA> 19 7 3 15 6 3 5 31 8 1 94 53
We plot our TSNE colored by our clustering results.
all_vars = c("Cell_Type", paste0("KM_",all_num_clust),
paste0("sc3_",all_ks,"_clusters"))
for(one_var in all_vars){
print(ggplot_custom(DATA = all_clust_res,X = "HVG_TSNE1",Y = "HVG_TSNE2",
COL = one_var,TYPE = "cat"))
}Next we plot the TSNE reported by (Habib et al. 2017), colored by our clustering results.
all_vars = c("Cell_Type",paste0("KM_",all_num_clust),
paste0("sc3_",all_ks,"_clusters"))
for(one_var in all_vars){
print(ggplot_custom(DATA = all_clust_res,X="Habib_TSNE1",Y="Habib_TSNE2",
COL = one_var,TYPE = "cat"))
}Finally we save the sce object and the clustering results
saveRDS(sce,file.path(dronc_dir,"sce.rds"))
saveRDS(sce_hvg,file.path(dronc_dir,"sce_hvg.rds"))
saveRDS(all_clust_res,file.path(dronc_dir,"all_clust_res.rds"))sessionInfo()## R version 3.5.0 (2018-04-23)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS 10.14.2
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] parallel stats4 stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] SC3_1.8.0 Rtsne_0.13
## [3] svd_0.4.1 limma_3.36.5
## [5] scran_1.8.4 scater_1.8.4
## [7] ggplot2_3.0.0 biomaRt_2.36.1
## [9] DropletUtils_1.0.3 SingleCellExperiment_1.2.0
## [11] SummarizedExperiment_1.10.1 DelayedArray_0.6.4
## [13] BiocParallel_1.14.2 matrixStats_0.54.0
## [15] Biobase_2.40.0 GenomicRanges_1.32.6
## [17] GenomeInfoDb_1.16.0 IRanges_2.14.10
## [19] S4Vectors_0.18.3 BiocGenerics_0.26.0
## [21] data.table_1.12.0 BiocInstaller_1.30.0
##
## loaded via a namespace (and not attached):
## [1] ggbeeswarm_0.6.0 colorspace_1.3-2
## [3] rjson_0.2.20 class_7.3-14
## [5] dynamicTreeCut_1.63-1 rprojroot_1.3-2
## [7] XVector_0.20.0 DT_0.4
## [9] bit64_0.9-7 mvtnorm_1.0-8
## [11] AnnotationDbi_1.42.1 codetools_0.2-15
## [13] tximport_1.8.0 doParallel_1.0.11
## [15] robustbase_0.93-2 knitr_1.20
## [17] cluster_2.0.7-1 pheatmap_1.0.10
## [19] shinydashboard_0.7.0 shiny_1.1.0
## [21] rrcov_1.4-4 compiler_3.5.0
## [23] httr_1.3.1 backports_1.1.2
## [25] assertthat_0.2.0 Matrix_1.2-14
## [27] lazyeval_0.2.1 later_0.7.3
## [29] htmltools_0.3.6 prettyunits_1.0.2
## [31] tools_3.5.0 bindrcpp_0.2.2
## [33] igraph_1.2.2 gtable_0.2.0
## [35] glue_1.3.0 GenomeInfoDbData_1.1.0
## [37] reshape2_1.4.3 dplyr_0.7.6
## [39] doRNG_1.7.1 Rcpp_0.12.18
## [41] gdata_2.18.0 iterators_1.0.10
## [43] DelayedMatrixStats_1.2.0 stringr_1.3.1
## [45] mime_0.5 irlba_2.3.2
## [47] rngtools_1.3.1 gtools_3.8.1
## [49] WriteXLS_4.0.0 statmod_1.4.30
## [51] XML_3.98-1.15 DEoptimR_1.0-8
## [53] edgeR_3.22.3 zlibbioc_1.26.0
## [55] scales_1.0.0 hms_0.4.2
## [57] promises_1.0.1 rhdf5_2.24.0
## [59] RColorBrewer_1.1-2 yaml_2.2.0
## [61] memoise_1.1.0 gridExtra_2.3
## [63] pkgmaker_0.27 stringi_1.2.4
## [65] RSQLite_2.1.1 pcaPP_1.9-73
## [67] foreach_1.4.4 e1071_1.7-0
## [69] caTools_1.17.1.1 bibtex_0.4.2
## [71] rlang_0.2.1 pkgconfig_2.0.1
## [73] bitops_1.0-6 evaluate_0.11
## [75] lattice_0.20-35 ROCR_1.0-7
## [77] purrr_0.2.5 Rhdf5lib_1.2.1
## [79] bindr_0.1.1 htmlwidgets_1.2
## [81] labeling_0.3 cowplot_0.9.3
## [83] bit_1.1-14 tidyselect_0.2.4
## [85] plyr_1.8.4 magrittr_1.5
## [87] R6_2.2.2 gplots_3.0.1
## [89] DBI_1.0.0 pillar_1.3.0
## [91] withr_2.1.2 RCurl_1.95-4.11
## [93] tibble_1.4.2 crayon_1.3.4
## [95] KernSmooth_2.23-15 rmarkdown_1.10
## [97] viridis_0.5.1 progress_1.2.0
## [99] locfit_1.5-9.1 grid_3.5.0
## [101] blob_1.1.1 FNN_1.1.2.1
## [103] digest_0.6.15 xtable_1.8-2
## [105] httpuv_1.4.5 munsell_0.5.0
## [107] registry_0.5 beeswarm_0.2.3
## [109] viridisLite_0.3.0 vipor_0.4.5
Habib, Naomi, Inbal Avraham-Davidi, Anindita Basu, Tyler Burks, Karthik Shekhar, Matan Hofree, Sourav R Choudhury, et al. 2017. “Massively Parallel Single-Nucleus Rna-Seq with Dronc-Seq.” Nature Methods 14 (10). Nature Publishing Group: 955.
Lun, Aaron TL, Karsten Bach, and John C Marioni. 2016. “Pooling Across Cells to Normalize Single-Cell Rna Sequencing Data with Many Zero Counts.” Genome Biology 17 (1). BioMed Central: 75.